This document describes the workflow for the “Water Demand and Supply – a Jevons Paradox?” analysis. In contains code and selected plots.
library(rstan)
library(bayesplot)
library(tidyverse)
library("rethinking")
library(reshape2)
library(rstanarm)
library(corrplot)
library("PerformanceAnalytics")
rstan_options(auto_write = TRUE)
library(cowplot)
africa <- read.csv2("Data/africa_flat.csv", header = T)
america <- read.csv2("Data/americas_flat.csv", header = T)
asia <- read.csv2("Data/asia_flat.csv", header = T)
europe <- read.csv2("Data/europe_flat.csv", header = T)
ocean <- read.csv2("Data/ocean_flat.csv", header = T)
data = rbind(africa,america,asia,europe,ocean)
dam_cap <- subset(data, Variable.Id == 4197) #total dam capacity
dam_cap_per <- subset(data, Variable.Id == 4471) #dam capacity per capita
tot_wat <- subset(data, Variable.Id == 4253) #total water withdrawl
tot_wat_per <- subset(data, Variable.Id == 4257) #total water withdrawl per capita
agr_wat <- subset(data, Variable.Id == 4250) #agricultural withdrawl
ind_wat <- subset(data, Variable.Id == 4252) #industrial withdrawl
pop_tot <- subset(data, Variable.Id == 4104) #total population
pop_dens <- subset(data, Variable.Id == 4107) #population density
pop_under <- subset(data, Variable.Id == 4474) #prevalence of undernourishment
pop_access <- subset(data, Variable.Id == 4114) #Total population with access to safe drinking-water
irri1 <- subset(data, Variable.Id == 4328) #% of the area equipped for irrigation actually irrigated
irri2 <- subset(data, Variable.Id == 4330) #% of irrigation potential equipped for irrigation
data_sub <- rbind(dam_cap,dam_cap_per,
tot_wat,tot_wat_per,agr_wat,ind_wat,
pop_tot,pop_dens,pop_under,pop_access,
irri1,irri2)
#remove columns
data_sub_long <- data_sub[,-c(2,4,7:8)]
#transform to wide format
data_sub_wide <-data_sub_long %>%
dplyr::group_by_at(vars(c(-Value))) %>% # group by everything other than the value column.
dplyr::mutate(row_id=1:n()) %>% ungroup() %>% # build group index
spread(key=Variable.Name , value =Value) %>% # spread
dplyr::select(-row_id) # drop the index
#aggreagate data into AQUASTAT timesteps
#create bins
data_sub_wide$bins <- cut(data_sub_wide$Year,
breaks=seq(from = 1958, to = 2018, 5),
labels=seq(from = 1960, to = 2015, 5))
data_sub_wide <- data_sub_wide[,-2]
#calculate mean based on bins
data_sub_wide_mean <- data_sub_wide %>%
group_by(Area, bins) %>%
summarize(dam_cap = mean(`Total dam capacity`, na.rm = TRUE),
dam_cap_per = mean(`Dam capacity per capita`, na.rm = TRUE),
tot_wat = mean(`Total water withdrawal`, na.rm = TRUE),
tot_wat_per = mean(`Total water withdrawal per capita`, na.rm = TRUE),
agr_wat = mean(`Agricultural water withdrawal`, na.rm = TRUE),
ind_wat = mean(`Industrial water withdrawal`, na.rm = TRUE),
pop_tot = mean(`Total population`, na.rm = TRUE),
pop_dens = mean(`Population density`, na.rm = TRUE),
pop_under = mean(`Prevalence of undernourishment (3-year average)`, na.rm = TRUE),
pop_access = mean(`Total population with access to safe drinking-water (JMP)`, na.rm = TRUE),
irri1 = mean(`% of irrigation potential equipped for irrigation`, na.rm = TRUE),
irri2 = mean(`% of the area equipped for irrigation actually irrigated`, na.rm = TRUE))
dat <- data_sub_wide_mean
# create subset for analysis
sub_tot <- dat[,c(1,2,4,6,10,11,12)] # all variables of interest
sub_1 <- dat[,c(1,2,4)] # Area, bins, dam_cap_per
sub_2 <- dat[,c(1,2,6)] # Area, bins, withdrawal_per
sub_2.1 <- dat[,c(1,2,4,6)] # Area, bins,dam_cap_per, withdrawal_per
sub_3 <- dat[,c(1,2,4,6,10)] # Area, bins, dam_cap_per, withdrawal_per, pop_dens
sub_4 <- dat[,c(1,2,11)] # Area, bins, pop_under
sub_5 <- dat[,c(1,2,12)] # Area, bins, pop_access
sub_5.1 <- dat[,c(1,2,4,6,12)] # Area, bins, pop_access
# choose subest to work with
sub <- sub_3
# filter out countires & remove NAs
sub <- subset(sub, !Area %in% c("SUR|Suriname","TKM|Turkmenistan","ISL|Iceland","CAN|Canada","SGP|Singapore","BGD|Bangladesh","GHA|Ghana"))
sub_na <- na.omit(sub)
sub_na <- subset(sub_na, dam_cap_per > 0)
length(unique(sub_na$Area))
## [1] 138
#calulate data availability
data_size<- sub_na %>%
group_by(Area) %>%
summarise(count=n())
mean(data_size$count)
## [1] 3.275362
#z-score normalisation
dat_z <- sub_na
# names of variables I don't want to scale
varnames <- c("Area", "bins")
# index vector of columns which must not be scaled
index <- names(dat_z) %in% varnames
# scale only the columns not in index
temp <- scale(dat_z[, !index])
dat_z[, !index] <- temp
#min-max normalisation
dat_norm <- sub_na
varnames <- c("Area", "bins")
index <- names(dat_norm) %in% varnames
temp <- as.data.frame(lapply(dat_norm[, !index], normalize))
dat_norm[, !index] <- temp
#choose data set
sub_na = dat_z
#histograms
hist(sub_na$tot_wat_per, nclass = 100)
hist(sub_na$dam_cap_per,nclass = 100)
hist(sub_na$pop_dens,nclass = 100)
ggplot(sub_na, aes(x=tot_wat_per, y= dam_cap_per)) +
geom_point() +
#scale_y_log10() +
#scale_x_log10() +
labs(y="Dam capacity per capita",
x="Total water withdrawal per capita",
title="Supply-Demand Relationship")
ggplot(sub_na, aes(x=pop_dens, y= dam_cap_per, col = bins)) +
geom_point() +
#scale_y_log10() +
#scale_x_log10() +
labs(y="Dam capacity per capita",
x="Population density",
title="Supply-Demand Relationship")
ggplot(sub_na, aes(x=bins, y= dam_cap_per)) +
geom_point() +
#scale_y_log10() +
#scale_x_log10() +
labs(y="Dam capacity per capita",
x="bins",
title="Supply-Demand Relationship")
curve( dnorm( x , 0 , 1 ) , from= -5 , to = 5 )
curve( dcauchy( x , 0 , 1 ) , from= -5 , to= 5 )
# beta 1
N <- 50
a <- rnorm( N , 0 , 0.2)
b <- rnorm( N , 0.5, 0.5 )
plot( NULL , xlim=range(sub_na$tot_wat_per) , ylim=c(-5,5) ,
xlab="tot_wat_per" , ylab="capacity" )
abline( h=0 , lty=1 )
abline( h=0 , lty=1 , lwd=0.5 )
mtext( "b ~ dnorm(0,1)" )
xbar <- mean(sub_na$tot_wat_per)
for ( i in 1:N ) curve( a[i] + b[i]*(x - xbar) ,
from=min(sub_na$tot_wat_per) , to=max(sub_na$tot_wat_per) , add=TRUE ,col=col.alpha("black",0.2) )
# beta 2
N <- 50
a <- rnorm( N , 0 , 0.2)
b <- rnorm( N , -0.5, 0.5 )
plot( NULL , xlim=range(sub_na$tot_wat_per) , ylim=c(-5,5) ,
xlab="tot_wat_per" , ylab="capacity" )
abline( h=0 , lty=1 )
abline( h=0 , lty=1 , lwd=0.5 )
mtext( "b ~ dnorm(0,1)" )
xbar <- mean(sub_na$pop_dens)
for ( i in 1:N ) curve( a[i] + b[i]*(x - xbar) ,
from=min(sub_na$tot_wat_per) , to=max(sub_na$tot_wat_per) , add=TRUE ,col=col.alpha("black",0.2))
###Prior predictive simulation 2
rm(data_all)
data_all <- data.frame()
N= 20
for (i in 1:N){
Nsim = length(sub_na$tot_wat_per)
sigma <- abs(rnorm(1, 0, 1))
beta0 <- rnorm(1, 0, 0.2)
beta1 <- rnorm(1, 0.5, 0.5)
beta2 <- rnorm(1, -0.5, 0.5)
data1 <- data.frame(
tot_cap = sub_na$tot_wat_per,
dam_cap = sub_na$dam_cap_per,
pop_d = sub_na$pop_dens,
sim_tot= beta0 + beta1* sub_na$tot_wat_per + beta2*sub_na$pop_dens + rnorm(Nsim, mean = 0, sd = sigma),
sim_with = beta0 + beta1 * sub_na$tot_wat_per + rnorm(Nsim, mean = 0, sd = sigma),
sim_dens = beta0 + beta2* sub_na$pop_dens + rnorm(Nsim, mean = 0, sd = sigma)
)
data_all <- rbind(data_all,data1)
}
theme_set(bayesplot::theme_default(base_size = 18))
theme_update(axis.text = element_text(size = 20))
p_comp <- ggplot(data_all, aes(x = dam_cap, y = sim_tot)) +
geom_point(alpha = 0.5, color = "red") +
labs(title = "Observed vs. Simulated",
x = "Observed dam capacity ",
y = "Simulated dam capacity")#,
p_tot <- ggplot(data_all, aes(x = tot_cap, y = sim_tot)) +
geom_point(alpha = 0.5, color = "red") +
labs(title = "Multiple Regression",
x = "Additive predictors",
y = "Simulated dam capacity")
p_dens <- ggplot(data_all, aes(x = pop_d, y = sim_dens)) +
geom_point(alpha = 0.5, color = "blue") +
labs(title = "Simple Regression",
x = "Population Density",
y = "Simulated dam capacity")
p_with <- ggplot(data_all, aes(x = tot_cap, y = sim_with)) +
geom_point(alpha = 0.5, color = "blue") +
labs(title = "Simple Regression",
x = "Total water withdrawal per capita",
y = "Simulated dam capacity")
p_dens
p_with
p_tot
p_comp
#create prior for model fit
my_prior <- normal(location = c(0.5, -0.5), scale = c(0.5, 0.5), autoscale = FALSE)
#Null model
fit0 <- stan_glm(dam_cap_per ~ 1,
data = sub_na,
prior_intercept = normal(0, 0.2),
family = gaussian(link = "identity"))
#Simple linear mdel
fit1 <- stan_glm(dam_cap_per ~ tot_wat_per,
data = sub_na,
prior = normal(1, 0.5),
prior_intercept = normal(0, 0.2),
prior_aux = normal(0, 1),
family = gaussian(link = "identity"))
#Multiple linear model
fit1.1 <- stan_glm(dam_cap_per ~ tot_wat_per + pop_dens,
data = sub_na,
prior = my_prior,
prior_intercept = normal(0, 0.2),
prior_aux = normal(0, 1),
QR = TRUE,
family = gaussian(link = "identity"))
loo0 <- loo(fit0)
loo1 <- loo(fit1)
loo1.1 <- loo(fit1.1)
(comp <- compare_models(loo0,loo1,loo1.1))
#choose model based on comparison
fit <- fit1.1
summary(fit)
prior_summary(fit)
n_draws <- 4000
# Draw from the linear predictor, no residual variance
posterior_line_draws <- posterior_linpred(fit, draws = n_draws)
posterior_line_draws_plot <- posterior_line_draws %>%
as.data.frame(.) %>%
gather(.) %>%
mutate(draw = rep(1:n_draws, length(fit$data$tot_wat_per)),
x = rep(fit$data$tot_wat_per, each = n_draws))
ggplot(posterior_line_draws_plot %>%
filter(draw %in% sample(1:n_draws, 30))) +
geom_point(data = sub_na, aes(x = tot_wat_per, y = dam_cap_per), col = "grey") +
geom_line(aes(x = x, y = value, group = draw), alpha = 0.3) +
theme_bw()
posterior_line_draws_plot_summary <- posterior_line_draws_plot %>%
group_by(x) %>%
summarize(q025 = quantile(value, 0.025),
q25 = quantile(value, 0.25),
q50 = quantile(value, 0.5),
q75 = quantile(value, 0.75),
q975 = quantile(value, 0.975))
ggplot(posterior_line_draws_plot_summary) +
geom_point(data = sub_na, aes(x = tot_wat_per, y = dam_cap_per), col = "grey") +
geom_ribbon(aes(x = x, ymin = q025, ymax = q975), alpha = 0.4) +
geom_ribbon(aes(x = x, ymin = q25, ymax = q75), alpha = 0.8) +
geom_line(aes(x = x, y = q50)) +
theme_bw() +
labs(title = "Parameter uncertainty")
# Draw from the whole model with residual variance
posterior_draws <- posterior_predict(fit, draws = n_draws)
posterior_draws_plot <- posterior_draws %>%
as.data.frame(.) %>%
gather(.) %>%
mutate(draw = rep(1:n_draws, length(fit$data$tot_wat_per)),
x = rep(fit$data$tot_wat_per, each = n_draws))
posterior_draws_plot_summary <- posterior_draws_plot %>%
group_by(x) %>%
summarize(q025 = quantile(value, 0.025),
q25 = quantile(value, 0.25),
q50 = quantile(value, 0.5),
q75 = quantile(value, 0.75),
q975 = quantile(value, 0.975))
ggplot(posterior_draws_plot_summary) +
geom_point(data = sub_na, aes(x = tot_wat_per, y = dam_cap_per), col = "grey") +
geom_ribbon(aes(x = x, ymin = q025, ymax = q975), alpha = 0.4) +
geom_ribbon(aes(x = x, ymin = q25, ymax = q75), alpha = 0.8) +
geom_line(aes(x = x, y = q50)) +
theme_bw() +
labs(title = "Model uncertainty",
x = "Additive predictors",
y = "Dam capacity per capita")
# posterior uncertainty intervals
mcmc_intervals(as.array(fit), pars = c("(Intercept)", "tot_wat_per","pop_dens", "sigma")) #Central posterior uncertainty intervals
mcmc_areas(as.array(fit), pars = c("(Intercept)", "tot_wat_per","pop_dens", "sigma"))
posterior_interval(as.matrix(fit), prob = 0.9, type = "central",
pars = c("(Intercept)", "tot_wat_per","pop_dens", "sigma"))
# univariate marginal posterior distributions
mcmc_hist(as.array(fit), pars = c("(Intercept)", "tot_wat_per","pop_dens", "sigma")) #plots marginal posterior distributions (combining all chains):
mcmc_hist_by_chain(as.array(fit), pars = c("(Intercept)", "tot_wat_per","pop_dens", "sigma")) #view separate histograms of each of the four Markov chains
mcmc_dens_overlay(as.array(fit), pars = c("(Intercept)", "tot_wat_per","pop_dens", "sigma"))
mcmc_violin(as.array(fit), pars = c("(Intercept)", "tot_wat_per","pop_dens", "sigma"))
# bivariate plots
mcmc_scatter(as.array(fit), pars = c( "tot_wat_per","pop_dens"))
mcmc_hex(as.array(fit), pars = c( "tot_wat_per","pop_dens"))
mcmc_pairs(as.array(fit), pars = c("(Intercept)", "tot_wat_per","pop_dens", "sigma"))
# trace plots
mcmc_trace(as.array(fit), pars = c("(Intercept)", "tot_wat_per","pop_dens", "sigma"))
mcmc_trace(as.array(fit), pars = c("(Intercept)", "tot_wat_per","pop_dens", "sigma"),
facet_args = list(ncol = 1, strip.position = "left"))
posterior_draws <- posterior_predict(fit, draws = 100)
ppc_dens_overlay(y = fit$data$dam_cap_per, yrep = posterior_draws)
pp_check(fit, plotfun = "hist", nreps = 5)
pp_check(fit, plotfun = "stat", stat = "mean")
pp_check(fit, plotfun = "stat", stat = "median")
#Distributions of test statistics
prop_zero <- function(x) mean(x <= 0)
prop_zero(fit$data$dam_cap_per)
prop_zero(posterior_draws)
ppc_stat(fit$data$dam_cap_per, posterior_draws, stat = "prop_zero", binwidth = 0.005)